Preprocessing QC statistics ¶

July 2025¶

In [1]:
import os
import sys

# Root of the NOVA repository and root of the data tree; both currently point
# to the same location.
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
os.environ['NOVA_HOME'] = NOVA_HOME

# The preprocessing logs and the QC figures sit side by side under the same
# dataset-specific output folder, so both paths come from one template.
LOGS_PATH, PLOT_PATH = (
    os.path.join(NOVA_HOME, 'outputs', 'preprocessing', 'ManuscriptFinalData_80pct', 'neuronsDay8_new', leaf)
    for leaf in ('logs', 'QC_figures')
)

# Put the repository on the import path (position 1) so the `tools` package
# imported further down in this cell can be resolved.
sys.path.insert(1, os.environ['NOVA_HOME'])
print(f"NOVA_HOME: {os.environ['NOVA_HOME']}")

print(os.environ['NOVA_HOME'])
import pandas as pd
import contextlib
import io
from IPython.display import display, Javascript

from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
                                                run_calc_hist_new
                                                
from tools.preprocessing_tools.qc_reports.qc_config import new_d8_panels, new_d8_markers, new_d8_marker_info, new_d8_cell_lines, new_d8_cell_lines_to_cond,\
                                    new_d8_cell_lines_for_disp, new_d8_reps, new_d8_line_colors, new_d8_lines_order, new_d8_custom_palette,\
                                    new_d8_expected_dapi_raw
%load_ext autoreload
%autoreload 2
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA
/home/projects/hornsteinlab/Collaboration/NOVA
In [2]:
# choose batches
# Batches covered by this QC report.
batches = [
    'batch1', 'batch2', 'batch3',
    'batch7', 'batch8', 'batch9', 'batch10',
]
# The processed-data validation below is run on the same batch names with a
# "CLEAN" suffix appended.
batches_clean = [f'{batch_name}CLEAN' for batch_name in batches]
batches
Out[2]:
['batch1', 'batch2', 'batch3', 'batch7', 'batch8', 'batch9', 'batch10']
In [3]:
# Read the preprocessing logs of the selected batches into a single table.
df = log_files_qc(LOGS_PATH, batches, filename_split='-', site_location=0)

# Split the rows by marker: the DAPI channel on one side, every other
# (target) marker on the other.
df_dapi = df.loc[df.marker.eq('DAPI')]
df_target = df.loc[df.marker.ne('DAPI')]
reading logs of batch8
reading logs of batch3
reading logs of batch9
reading logs of batch10
reading logs of batch2
reading logs of batch1
reading logs of batch7

Total of 15 files were read.
Before dup handeling  (1147717, 21)
After duplication removal #1: (1071227, 22)
After duplication removal #2: (1071227, 22)

PAY ATTENTION!!!! df.site_num: r06c02f14, can be defined using filename_split & site_location

Actual Files Validation¶

Raw Files Validation¶

  1. How many site tiff files do we have in each folder?
  2. Are all existing files valid? (tif, at least 2049kB, not corrupted)
In [4]:
# Location of the raw (unprocessed) site images.
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'OPERA_indi_sorted')

# Validate the raw folder tree of every batch and keep the result for the
# raw-vs-processed comparison further down.
raws = run_validate_folder_structure(
    root_directory_raw,
    False,  # NOTE(review): presumably the "processed files" switch (the processed call passes True) - confirm in qc_utils
    new_d8_panels,
    new_d8_markers,
    PLOT_PATH,
    new_d8_marker_info,
    new_d8_cell_lines_to_cond,
    new_d8_reps,
    new_d8_cell_lines_for_disp,
    new_d8_expected_dapi_raw,
    batches=batches,
    expected_count=250,
    check_antibody=False,
)
batch1
Folder structure is invalid. Missing 11 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelA
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelB
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelC
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelD
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelE
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelF
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelG
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelH
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelI
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelJ
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch1/SNCA/panelL
No bad files are found.
Total Sites:  163997
========
batch2
Folder structure is invalid. Missing 1 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch2/SNCA
1 files are bad:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch2/WT/panelA/Untreated/rep2/FMRP, Thumbs.db, ext is .db
Total Sites:  164001
========
batch3
Folder structure is invalid. Missing 1 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch3/SNCA
No bad files are found.
Total Sites:  163996
========
batch7
Folder structure is valid.
No bad files are found.
Total Sites:  184494
========
batch8
Folder structure is valid.
No bad files are found.
Total Sites:  184500
========
batch9
Folder structure is valid.
1 files are bad:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/OPERA_indi_sorted/batch9/WT/panelB/Untreated/rep2/CD41, Thumbs.db, ext is .db
Total Sites:  184493
========
batch10
Folder structure is valid.
No bad files are found.
Total Sites:  184500
========
====================

Processed Files Validation¶

  1. How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
  2. Are all existing files valid? (at least 100kB, npy not corrupted)
In [5]:
# Location of the processed site files for this dataset.
root_directory_proc = os.path.join(
    NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct', 'neuronsDay8_new'
)

# Same validation as for the raw images, this time on the "CLEAN" batch
# folders; the result is kept for the raw-vs-processed comparison.
procs = run_validate_folder_structure(
    root_directory_proc,
    True,  # NOTE(review): presumably the "processed files" switch (the raw call passes False) - confirm in qc_utils
    new_d8_panels,
    new_d8_markers,
    PLOT_PATH,
    new_d8_marker_info,
    new_d8_cell_lines_to_cond,
    new_d8_reps,
    new_d8_cell_lines_for_disp,
    new_d8_expected_dapi_raw,
    batches=batches_clean,
    expected_count=250,
    check_antibody=False,
)
batch1CLEAN
Folder structure is invalid. Missing 1 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay8_new/batch1CLEAN/SNCA
No bad files are found.
Total Sites:  145369
========
batch2CLEAN
Folder structure is invalid. Missing 1 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay8_new/batch2CLEAN/SNCA
No bad files are found.
Total Sites:  144929
========
batch3CLEAN
Folder structure is invalid. Missing 1 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay8_new/batch3CLEAN/SNCA
No bad files are found.
Total Sites:  145953
========
batch7CLEAN
Folder structure is valid.
No bad files are found.
Total Sites:  154031
========
batch8CLEAN
Folder structure is valid.
No bad files are found.
Total Sites:  168274
========
batch9CLEAN
Folder structure is valid.
No bad files are found.
Total Sites:  153650
========
batch10CLEAN
Folder structure is valid.
No bad files are found.
Total Sites:  131289
========
====================

Difference between Raw and Processed¶

In [6]:
# Compare the raw validation result (`raws`) with the processed one (`procs`)
# for every batch. PLOT_PATH is passed along, presumably as the folder where
# the figures are saved - confirm in qc_utils.
display_diff(batches, raws, procs, PLOT_PATH)
batch1
========
batch2
========
batch3
========
batch7
========
batch8
========
batch9
========
batch10
========

Variance in each batch (of processed files)¶

In [7]:
root_directory_proc = os.path.join(
    NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct', 'neuronsDay8_new'
)
for batch in batches_clean:
    # Swallow anything the sampler writes to stdout; only the resulting
    # variance is reported, one line per batch.
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(
            root_directory_proc,
            batch,
            sample_size_per_markers=200,
            num_markers=30,
        )
    print(f'{batch} var: ',var)
batch1CLEAN var:  0.05012238092547438
batch2CLEAN var:  0.052469953520045735
batch3CLEAN var:  0.05186614281187016
batch7CLEAN var:  0.04970548078871655
batch8CLEAN var:  0.05053652939497635
batch9CLEAN var:  0.05068049787840124
batch10CLEAN var:  0.05114243272228387

Preprocessing Filtering qc¶

By order of filtering

1. % site survival after Brenner on DAPI channel¶

Percentage out of the total sites

In [8]:
# Site survival after the Brenner filter on the DAPI channel. The result is
# fed into the Cellpose survival step that follows.
dapi_filter_by_brenner = show_site_survival_dapi_brenner(
    df_dapi, batches, new_d8_line_colors, new_d8_panels, new_d8_reps,
    vmax=250,
    # Ignore SNCA in batches 1-3; the raw folder validation above reports its
    # folders as missing there. Every value is passed as a list, in line with
    # the 'batch' key and with the Cellpose and tiling calls (which pass
    # ['SNCA']), instead of the bare string 'SNCA' used here previously.
    to_ignore={'cell_line_cond': ['SNCA'], 'batch': ['batch1', 'batch2', 'batch3']},
)

2. % Site survival after Cellpose¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if Cellpose found 0 cells in it.

In [9]:
# Site survival after Cellpose, computed relative to the sites that passed the
# DAPI Brenner filter (`dapi_filter_by_brenner`).
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(
    df_dapi,
    batches,
    dapi_filter_by_brenner,
    new_d8_line_colors,
    new_d8_panels,
    new_d8_reps,
    figsize=(7, 5),
    to_ignore={'cell_line_cond': ['SNCA'], 'batch': ['batch1', 'batch2', 'batch3']},
)

3. % Site survival by tiling¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if, after tiling, no tile contains at least one whole cell that Cellpose detected.

In [10]:
# Site survival after tiling, computed relative to the sites that passed the
# Cellpose filter (`dapi_filter_by_cellpose`).
dapi_filter_by_tiling = show_site_survival_dapi_tiling(
    df_dapi,
    batches,
    dapi_filter_by_cellpose,
    new_d8_line_colors,
    new_d8_panels,
    new_d8_reps,
    figsize=(7, 5),
    to_ignore={'cell_line_cond': ['SNCA'], 'batch': ['batch1', 'batch2', 'batch3']},
)

4. % Site survival after Brenner on target channel¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).

In [11]:
# Last filtering step: Brenner on the target (non-DAPI) channel, taking the
# tiling survival result (`dapi_filter_by_tiling`) as the previous stage.
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling, new_d8_markers, figsize=(7,8))

Statistics About the Processed Files¶

In [12]:
# Columns of the log table that are summed up for the tables and heatmaps below.
stats = [
    'n_valid_tiles',
    'site_whole_cells_counts_sum',
    'site_cell_count',
    'site_cell_count_sum',
]
total_sum = calc_total_sums(df_target, df_dapi, stats, new_d8_markers)

Total tiles¶

In [13]:
# Total number of valid tiles, leaving out every marker whose name contains 'TIA'.
total_sum.loc[~total_sum.marker.str.contains('TIA', regex=True), 'n_valid_tiles'].sum()
Out[13]:
9196363
In [14]:
# Total tiles in the WT lines (stress and untreated), with the markers whose
# name contains 'TIA' left out.
total_sum.loc[
    (~total_sum.marker.str.contains('TIA', regex=True))
    & total_sum.cell_line_cond.isin(['WT stress', 'WT Untreated']),
    'n_valid_tiles',
].sum()
Out[14]:
2535188
In [15]:
# Total tiles in untreated lines: 'WT Untreated' plus every line whose name
# does not contain 'WT'. Markers whose name contains 'TIA' are left out.
total_sum.loc[
    (~total_sum.marker.str.contains('TIA', regex=True))
    & ((total_sum.cell_line_cond == 'WT Untreated') | ~total_sum.cell_line_cond.str.contains('WT')),
    'n_valid_tiles',
].sum()
Out[15]:
7923051

Total whole nuclei in tiles¶

In [16]:
# Whole-cell counts are summed over the DAPI rows only.
total_sum.loc[total_sum.marker == 'DAPI', 'site_whole_cells_counts_sum'].sum()
Out[16]:
1849083.0

Total nuclei in sites¶

In [17]:
# Per-site cell counts are summed over the DAPI rows only.
total_sum.loc[total_sum.marker == 'DAPI', 'site_cell_count'].sum()
Out[17]:
8102443.0
In [18]:
# Print summary tables of the aggregated statistics in `total_sum`: one table
# per batch and one over all batches (see the cell output).
show_total_sum_tables(total_sum)
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch1
count 6.560000e+02 656.000000 656.000000 6.560000e+02
mean 2.260840e+03 22.608399 1419.608232 5.842776e+03
std 7.071879e+02 7.071879 523.800726 1.797175e+03
min 4.700000e+02 4.700000 263.000000 1.131000e+03
25% 1.691500e+03 16.915000 1077.500000 4.754500e+03
50% 2.300000e+03 23.000000 1384.000000 5.839000e+03
75% 2.791000e+03 27.910000 1849.750000 7.186250e+03
max 3.763000e+03 37.630000 2646.000000 1.027900e+04
sum 1.483111e+06 NaN 931263.000000 3.832861e+06
expected_count 4.500000e+02 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch10
count 7.380000e+02 738.000000 738.000000 7.380000e+02
mean 1.631982e+03 16.319824 1137.668022 4.661591e+03
std 1.060984e+03 10.609838 822.340207 2.972323e+03
min 6.000000e+00 0.060000 0.000000 1.800000e+01
25% 7.217500e+02 7.217500 435.250000 2.019500e+03
50% 1.567000e+03 15.670000 1064.500000 4.841500e+03
75% 2.535750e+03 25.357500 1668.000000 7.075500e+03
max 3.646000e+03 36.460000 3335.000000 1.107800e+04
sum 1.204403e+06 NaN 839599.000000 3.440254e+06
expected_count 4.500000e+02 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch2
count 6.530000e+02 653.000000 653.000000 6.530000e+02
mean 1.597243e+03 15.972435 879.632466 3.976789e+03
std 4.735530e+02 4.735530 268.013557 1.037326e+03
min 1.830000e+02 1.830000 67.000000 4.770000e+02
25% 1.285000e+03 12.850000 689.000000 3.362000e+03
50% 1.700000e+03 17.000000 903.000000 4.106000e+03
75% 1.937000e+03 19.370000 1069.000000 4.688000e+03
max 2.511000e+03 25.110000 1670.000000 6.325000e+03
sum 1.043000e+06 NaN 574400.000000 2.596843e+06
expected_count 4.500000e+02 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch3
count 6.560000e+02 656.000000 656.000000 6.560000e+02
mean 1.672093e+03 16.720930 890.144817 4.118247e+03
std 5.010009e+02 5.010009 272.753277 1.072282e+03
min 3.370000e+02 3.370000 163.000000 7.530000e+02
25% 1.358250e+03 13.582500 715.750000 3.512750e+03
50% 1.751000e+03 17.510000 892.000000 4.231500e+03
75% 2.021250e+03 20.212500 1077.250000 4.890500e+03
max 2.802000e+03 28.020000 1696.000000 6.807000e+03
sum 1.096893e+06 NaN 583935.000000 2.701570e+06
expected_count 4.500000e+02 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch7
count 7.330000e+02 733.000000 733.000000 7.330000e+02
mean 1.954674e+03 19.546739 1277.364256 5.774357e+03
std 7.899062e+02 7.899062 570.143431 2.020152e+03
min 5.000000e+00 0.050000 2.000000 1.500000e+01
25% 1.464000e+03 14.640000 911.000000 4.668000e+03
50% 2.089000e+03 20.890000 1227.000000 6.048000e+03
75% 2.501000e+03 25.010000 1590.000000 7.023000e+03
max 3.571000e+03 35.710000 3200.000000 9.793000e+03
sum 1.432776e+06 NaN 936308.000000 4.232604e+06
expected_count 4.500000e+02 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch8
count 7.380000e+02 738.000000 738.000000 7.380000e+02
mean 2.165144e+03 21.651436 1331.829268 5.985867e+03
std 6.710593e+02 6.710593 378.081507 1.386811e+03
min 1.900000e+02 1.900000 130.000000 4.830000e+02
25% 1.826000e+03 18.260000 1078.250000 5.037500e+03
50% 2.281500e+03 22.815000 1316.500000 6.050000e+03
75% 2.656000e+03 26.560000 1607.000000 7.153000e+03
max 3.458000e+03 34.580000 2327.000000 9.124000e+03
sum 1.597876e+06 NaN 982890.000000 4.417570e+06
expected_count 4.500000e+02 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch9
count 7.350000e+02 735.000000 735.000000 7.350000e+02
mean 2.093129e+03 20.931293 1314.273469 5.794131e+03
std 8.432317e+02 8.432317 523.021712 1.994131e+03
min 0.000000e+00 0.000000 8.000000 1.100000e+01
25% 1.413000e+03 14.130000 999.500000 4.595500e+03
50% 2.302000e+03 23.020000 1322.000000 6.128000e+03
75% 2.758000e+03 27.580000 1656.500000 7.215000e+03
max 3.634000e+03 36.340000 2845.000000 9.762000e+03
sum 1.538450e+06 NaN 965991.000000 4.258686e+06
expected_count 4.500000e+02 450.000000 450.000000 4.500000e+02
n valid tiles % valid tiles site_whole_cells_counts_sum site_cell_count
All batches
count 4.909000e+03 4909.000000 4.909000e+03 4.909000e+03
mean 1.914139e+03 19.141391 1.184434e+03 5.190546e+03
std 7.936189e+02 7.936189 5.542821e+02 2.048276e+03
min 0.000000e+00 0.000000 0.000000e+00 1.100000e+01
25% 1.390000e+03 13.900000 8.030000e+02 3.881000e+03
50% 1.976000e+03 19.760000 1.133000e+03 5.174000e+03
75% 2.484000e+03 24.840000 1.521000e+03 6.658000e+03
max 3.763000e+03 37.630000 3.335000e+03 1.107800e+04
sum 9.396509e+06 NaN 5.814386e+06 2.548039e+07
expected_count 4.500000e+02 450.000000 4.500000e+02 4.500000e+02

Show Total Tile Counts¶

For each batch, cell line, replicate and marker: total number of tiles

In [19]:
# The statistic to plot is handed to the heatmap helper under the column name
# 'index' (presumably the name plot_filtering_heatmap reads - confirm in qc_utils).
to_heatmap = total_sum.rename(columns={'n_valid_tiles': 'index'})
plot_filtering_heatmap(
    to_heatmap,
    extra_index='marker',
    vmin=None,
    vmax=None,
    xlabel = 'Total number of tiles',
    show_sum=True,
    figsize=(6, 8),
)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)

Show Total Whole Cell Counts¶

For each batch, cell line, replicate and marker: total number of whole cells

In [20]:
# Same heatmap as for the tile counts, this time for the whole-cell counts.
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum': 'index'})
# NOTE(review): only batch7 is plotted here, whereas the tile-count heatmap
# receives all batches - confirm that this restriction is intentional.
plot_filtering_heatmap(
    to_heatmap[to_heatmap.batch == 'batch7'],
    extra_index='marker',
    vmin=None,
    vmax=None,
    xlabel = 'Total number of whole cells',
    show_sum=True,
    figsize=(6, 8),
)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:391: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)

Show Cell Count Statistics per Batch¶

In [21]:
# Keep only the DAPI sites whose valid-tile count is not zero.
df_no_empty_sites = df_dapi.loc[df_dapi.n_valid_tiles.ne(0)]

# One bar plot per cell-count measure.
for count_column, plot_title in (
    ('site_cell_count_sum', 'Cell Count Average per Site (from tiles)'),
    ('site_whole_cells_counts_sum', 'Whole Cell Count Average per Site'),
    ('site_cell_count', 'Cellpose Cell Count Average per Site'),
):
    plot_cell_count(
        df_no_empty_sites,
        new_d8_lines_order,
        new_d8_custom_palette,
        y=count_column,
        title=plot_title,
    )
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:618: UserWarning: The palette list has more values (10) than needed (9), which may not be intended.
  c = sns.barplot(data=batch, x='rep', hue='cell_line_cond', y=y, hue_order = order,

Show Tiles per Site Statistics¶

In [22]:
# Mean number of valid tiles per DAPI site, per cell line / condition.
df_dapi.groupby(['cell_line_cond'])['n_valid_tiles'].mean()
Out[22]:
cell_line_cond
FUSHeterozygous    7.729141
FUSHomozygous      8.147912
FUSRevertant       7.297871
OPTN               8.778378
SNCA               6.696576
TBK1               7.475947
TDP43              9.103717
WT Untreated       9.866203
WT stress          9.851510
Name: n_valid_tiles, dtype: float64
In [23]:
# Plot the valid-tile counts of the DAPI sites for the selected batches.
# NOTE(review): this call emits a SettingWithCopyWarning from qc_utils (see the
# cell output); df_dapi is a filtered selection of df, so confirm that the
# helper assigns its 'batch_rep' column on a copy of its own.
plot_catplot(df_dapi, new_d8_custom_palette,new_d8_reps, x='n_valid_tiles', x_title='valid tiles count', batches=batches)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1058: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']

Show Mean of cell count in valid tiles¶

In [24]:
# Heatmaps of the mean cell count per valid tile: cell lines as rows, panels as
# columns, split by replicate (presumably one heatmap per replicate - confirm
# in qc_utils).
plot_hm_of_mean_cell_count_per_tile(df_dapi, split_by='rep', rows='cell_line', columns='panel', vmax=3)
In [25]:
# Overall mean of the per-site mean cell count in valid tiles. The column is
# kept as a one-column frame so the result is displayed as a labelled Series.
df_dapi.loc[:, ['cells_count_in_valid_tiles_mean']].mean()
Out[25]:
cells_count_in_valid_tiles_mean    1.744963
dtype: float64
In [26]:
# Overall mean cell count per site. The column is kept as a one-column frame
# so the result is displayed as a labelled Series.
df_dapi.loc[:, ['site_cell_count']].mean()
Out[26]:
site_cell_count    22.841799
dtype: float64

Assessing Staining Reproducibility and Outliers¶

In [27]:
# for batch in batches:
#     print(batch)
#     #batch_num = batch.replace('batch',"")
#     run_calc_hist_new(batch,new_d8_cell_lines_for_disp, new_d8_markers, root_directory_raw, root_directory_proc,
#                            hist_sample=10,sample_size_per_markers=200, ncols=7, nrows=4)
#     print("="*30)
In [28]:
# # save notebook as HTML ( the HTML will be saved in the same folder the original script is)
# display(Javascript('IPython.notebook.save_checkpoint();'))
# os.system(f'jupyter nbconvert --to html {NOVA_HOME}/tools/preprocessing_tools/qc_reports/qc_report_new_d8_80pct.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_new_d8_80pct.html')